# =============================================================================
# =============================================================================
# # Code to prep the Clarity Data 
# # This code is intended to get the raw clarity files ready to be reorganized into cohorts.
# # I create all of the dependent variables of interest and some of the heterogeneity variables (e.g. age, income, etc.)
# #    1. Import data, rename vars, change datatypes, general cleaning.
# #    2. Prep the ct clarity tradelines
# #    3. Prep the ci clarity inquiries
# =============================================================================
# =============================================================================

#Import the main datasets
ct = pd.read_parquet(cdd['p_d_cl'] + r'\Sample\ct_raw_sample.parquet') #tradelines
ci = pd.read_parquet(cdd['p_d_cl'] + r'\Sample\ci_raw_sample.parquet') #inquiries


#Clean the datatypes and rename
clarity_dict = pd.read_excel(cdd['p_d_cl_c'] + r'\ClarityVariables.xlsx', sheet_name='Codebook', engine='openpyxl')
clarity_dtypes = dict(zip(clarity_dict['Name'].to_list(),clarity_dict['NewType'].to_list()))
clarity_rn = dict(zip(clarity_dict['Name'].to_list(),clarity_dict['Rename'].to_list()))
ct=downsize(df=ct,types=clarity_dtypes).rename(columns=clarity_rn)
ci=downsize(df=ci,types=clarity_dtypes).rename(columns=clarity_rn)

#Status had some missing values to fill
ct['status'] = ct['status'].astype(str).replace('nan','').astype('category')

ct['id'] = pd.to_numeric(ct['cid'].str[3:])
ct = ct.reset_index().rename(columns={'index':'tid'})
ci['id'] = pd.to_numeric(ci['cid'].str[3:])
ct.drop(columns=['cid'],inplace=True)
ci.drop(columns=['cid'],inplace=True)

ci['iid'] = 1
ci['iid'] = ci['iid'].cumsum()
ct['tid'] = 1
ct['tid'] = ct['tid'].cumsum()


# Make a dictionary from event_date to quarter dates
datesm = [dt.datetime(year=y,month=m,day=1) - dt.timedelta(days=1)  for m in [1,2,3,4,5,6,7,8,9,10,11,12] for y in range(1999,2024)]
datesq = [dt.datetime(year=y,month=m,day=1) - dt.timedelta(days=1)  for m in [1,4,7,10] for y in range(1999,2024)]
datev = pd.DataFrame(pd.date_range(start="2000-01-01",end="2022-12-31")).rename(columns={0:'date'})
datev['date_postm'] = datev['date'].apply(lambda x: min([v for v in datesm if v>=x]))
datev['date_prem'] = datev['date'].apply(lambda x: max([v for v in datesm if v<x]))
datev['date_postq'] = datev['date'].apply(lambda x: min([v for v in datesq if v>=x]))
datev['date_preq'] = datev['date'].apply(lambda x: max([v for v in datesq if v<x]))
date_post_dict = {'m':dict(zip(datev['date'].tolist(),datev['date_postm'].tolist())),
                  'q':dict(zip(datev['date'].tolist(),datev['date_postq'].tolist()))}
date_pre_dict = {'m':dict(zip(datev['date'].tolist(),datev['date_prem'].tolist())),
                 'q':dict(zip(datev['date'].tolist(),datev['date_preq'].tolist()))}



actd = {'C1':'Internet SPML','1C':'Internet SPML ?','C2':'Storefront SPML','C3':'Online Installment','C4':'Check Cashing Credit',
        'C5':'Online Rent-to-Own ','C6':'Storefront Installment','C7':'Internet Line of Credit','C8':'Storefront Line of Credit',
        'C9':'Auto','C10':'Online Title Loan','C11':'Storefront Title Loan','C12':'Market place lending','C13':'Storefront Rent to Own',
        'C36':'Retail Store Payment','C50':'Credit Monitor','C99':'Other Bill Payment','4D':'Telecom',
        '00':'Traditional Auto','01':'Unsecured','02':'Secured','06':'Installment Sales Contract',
        '13':'Lease','15':'Line of Credit','20':'Note Loan','18':'Credit Card','10':'Business Loan','07':'Charge Account',
        '2A':'Secured Credit Card Debt', '90':'Medical Debt'}

# itypes = ['C3','C1','C5','C9','C13','01','C6','4D','C11','C2']
# ttypes = ['C1','C3','C2','01','C6','C5']
itypes = ['C3','C1','C5','C9','C13','C6','C2']
ttypes = ['C1','C3','C2','01','C6','C5']

#Last day of data.
dsend = dt.datetime(2020,12,31)

#Subset the clarity data to just the inquiry and transaction types that are best populated
ci = ci[ci.typea.isin(itypes)]
ct = ct[ct.typea.isin(ttypes)]

ct.to_parquet(cdd['p_d_cl_c'] + r'\Tradelines.parquet')
ci.to_parquet(cdd['p_d_cl_c'] + r'\Inquiries.parquet')


# =============================================================================
# Create the panel version of tradelines
# =============================================================================
ct = pd.read_parquet(cdd['p_d_cl_c'] + r'\Tradelines.parquet')

#Homogenize some of the notation.
freqdict = {'B':'14','E':'15','L':'60','M':'30','P':np.nan,'S':np.nan,'T':'120','W':'7'}
ct['freqr'] = pd.to_numeric(np.where(ct['freq'].isin(freqdict.keys()),ct['freq'].map(freqdict),ct['freq']))

#Flesh out some of the date information.
ct['freq'] = np.where(ct.typep.isin(['I','N']),ct['freqr'].fillna(30),ct['freqr'])
ct['date_df'] = ct['date_df'].fillna(ct['open'] + dt.timedelta(days=1) * (ct['freq']+0))
ct['freq'] = np.where((np.isnan(ct['freq']))&((ct['date_df'] - ct['open']).dt.days <= 30),(ct['date_df'] - ct['open']).dt.days,ct['freq'])
ct['freq'] = ct['freq'].fillna(30)
ct['openr'] = ct['open']
ct['open'] = ct['date_df'] - dt.timedelta(days=1) * (ct['freq']+0)
ct['close'] = ct['close'].fillna(ct['date_del'])
ct['close'] = np.where(ct['close']==ct['open'], ct['close'] + dt.timedelta(days=1), ct['close'])
ct['duration'] = pd.to_numeric(ct['duration'],errors='coerce')
ct['openm'] = ct['open'].map(date_post_dict['m'])
ct['openq'] = ct['open'].map(date_post_dict['q'])

#Note there are a lot of outlier observations and they appear to be mostly installment loans that have a single expected payment
#   I don't know if these are revolving products or if the duration was missing
#   I believe they are misclafied as installment when they should be SPML
ct['typear'] = ct['typea'] 
ct['typea'] = np.where((ct['typea'].isin(['C3','C6']))&(ct['duration']==1),ct['typea'].map({'C3':'C1','C6':'C2'}),ct['typea'])
ct['typea'] = np.where((ct['typea'].isin(['C1','C2']))&(ct['duration']>1),ct['typea'].map({'C1':'C3','C2':'C6'}),ct['typea'])

#Simplify the status variables
ct['rep_form'] = (ct.status == '')*1 #Status is blank when they report in clarity form.
ct['st_curr'] = np.where((ct.status.isin(['11']))|((ct.pmt_rating.isin(['0']))&(ct.rep_form==1)),1,0)
ct['st_gc'] = np.where((ct.status.isin(['13']))|((ct.pmt_rating.isin(['@']))&(ct.rep_form==1)),1,0)
ct['st_del'] = np.where((ct.status.isin(['71','78','80','82','83','84']))|
                    ((ct.pmt_rating.isin([ '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
                                           'G', 'H', 'I', 'J', 'K', 'L', 'm', 'N', 'P', 'q', 'R', 'S', 'T', 'U', 'V',
                                           'W', 'X', 'Y', 'Z']))&(ct.rep_form==1))|
                    ((ct.pmt_rating.isin(['1', '2', '3', '4', '5', '6']))&(ct.rep_form==0)),1,0)
ct['st_col'] = np.where((ct.status.isin(['93','95','96','61','62','63','65','71','78','80','82','83','84','DF']))|
                    ((ct.pmt_rating.isin(['#']))&(ct.rep_form==1))|((ct.pmt_rating.isin(['G']))&(ct.rep_form==0)),1,0)
ct['st_co'] = np.where((ct.status.isin(['97','64']))|
                    ((ct.pmt_rating.isin(['+']))&(ct.rep_form==1))|((ct.pmt_rating.isin(['L']))&(ct.rep_form==0)),1,0)
ct['st_transfered'] = np.where(ct.status.isin(['05']),1,0)
ct['st_gc'] = np.where((ct.st_gc==0)&(ct.st_curr==0)&(ct.st_del==0)&(ct.st_col==0)&(ct.st_co==0),1,ct.st_gc)


# Look at a second way to organize the transaction data. 
#   Each transaction is a single row. Calculate some measures of maturity and exit
ct['typeas'] = ct['typea'].map(actd).astype('category')
ct['maturity'] = ct['duration'] * ct['freq']
# ct['closet'] = ct['open'] + ct['maturity'] * dt.timedelta(days=1) #Note: This is the line actually used
ct['closet'] = ct['open'] + ct['maturity'].apply(lambda x: min(x,1) if x>0 else 0) * dt.timedelta(days=1) #This line is an adaptation necessary due to the random cell level sampling. Sometimes maturity is randomly very large.
ct['duration_e2a'] = (ct['close'] - ct['closet']).dt.days / ct['maturity']


ctkvars = ['id', 'tid', 'state', 'zip', 'typep', 'typea', 'typear', 'typeas', 'open', 'openm', 'openq', 'close', 'date_df', 'date_lp', 'date_del',
             'closet', 'maturity', 'duration_e2a', 'status', 'duration', 'freq', 'freqr', 'bal_highprin', 'bal_curr', 
             'bal_del', 'pmt_sch', 'pmt_act', 'chargeoff', 'pmt_rating', 'openr', 'rep_form', 'st_curr',
             'st_gc', 'st_del', 'st_col', 'st_co', 'st_transfered']

ct[ctkvars].to_parquet(cdd['p_d_cl_c'] + r'\ct.parquet')


ct = pd.read_parquet(cdd['p_d_cl_c'] + r'\ct.parquet')
ct.groupby('typea')['bal_highprin'].mean()



#For each loan blow it out into a daily time series
def panel_helper(df,datesd,datesmq,target,g,frequency):
    #Build panel
    ctf = bal_panel({'tid':df['tid'].unique(),'dated':datesd})
    df['close_post'] = df['close'].map(date_post_dict[frequency]).fillna(date_post_dict[frequency][dsend])
    df['open_pre'] = df['open'].map(date_pre_dict[frequency])
    ctf = pd.merge(ctf,df,how='left',on='tid')
    ctf = ctf[(ctf.dated > ctf.open_pre)&(ctf.dated <= ctf.close_post)]
    ctf['date'] = ctf['dated'].map(datesmq)
    
    #Null out some loan characteristics so that they won't count toward the aggregates before open or after close.
    for v in ['tid','typep','typea','bal_highprin','freq']:
        ctf[v] = np.where((ctf.dated>ctf.close)|(ctf.dated<ctf.open),np.nan,ctf[v])
    
    #The number of loans and loan balance characteristics
    ctf['ntrans'] = ctf.groupby(['id','date'])['tid'].transform('nunique')
    ctf['ntransa'] = ctf.groupby(['id','date'])['typea'].transform('nunique')
    ctf['ntrans_dc'] = ctf.groupby(['id','dated'])['tid'].transform('nunique').replace(0,np.nan)
    ctf['ntrans_d'] = ctf['ntrans_dc'].fillna(0)
    ctf['ch_dc'] = ctf.groupby(['id','dated'])['bal_highprin'].transform('sum',min_count=1)
    ctf['ch_d'] = ctf['ch_dc'].fillna(0)
    ctf['freq_dc'] = ctf.groupby(['id','dated'])['freq'].transform('mean')
    ctf['freq_d'] = ctf['freq_dc'].fillna(0)
    ctf['anytrans_d'] = (ctf.groupby(['id','dated'])['tid'].transform('nunique') > 0)*1
    ctf_m = ctf.drop_duplicates(subset=['id','dated']).groupby(['id','date'])[['ntrans_dc','ch_dc','ntrans_d','ch_d','freq_dc','anytrans_d']].mean().reset_index()
    ctf_m.rename(columns={'ntrans_dc':'ntrans_dcmean','ch_dc':'ch_dcmean','ntrans_d':'ntrans_dmean','ch_d':'ch_dmean','freq_dc':'freq_dcmean','anytrans_d':'anytrans_dmean'},inplace=True)
    ctf_max = ctf.drop_duplicates(subset=['id','dated']).groupby(['id','date'])[['ntrans_dc','ch_dc']].max().reset_index()
    ctf_max.rename(columns={'ntrans_dc':'ntrans_dcmax','ch_dc':'ch_dcmax'},inplace=True)
    # #look at loan outcomes: delinquency / good close / rollover / collections / chargeoff
    # for v in ['st_gc','st_del','st_col','st_co']:
    #     ctf[v+'_tid'] = np.where((ctf[v]==1)&(ctf['close']<=ctf['close_post'])&(ctf['close']>ctf['close_post']),ctf['tid'],np.nan)
    #     ctf[v+'_f'] = ctf.groupby(['id','date'])[v+'_tid'].transform('nunique').fillna(0) / ctf['ntrans']
    
    #####The number of loans and loan balance by loan account type (conditional on there being a loan)
    ctf_ca = ctf.groupby(['id','dated','date','typea']).agg(ntrans_dca=('tid','nunique'),ch_dca=('bal_highprin','sum'),freq_dca=('freq','mean')).reset_index()
    ctf_ca = ctf_ca.pivot(index=['id','dated','date'],columns=['typea'],values=['ntrans_dca','ch_dca','freq_dca']).reset_index()
    ctf_ca.columns = [''.join(col).strip() for col in ctf_ca.columns]
    #Take the mean within quarter
    ctf_cam = ctf_ca.groupby(['id','date'])[[v for v in list(ctf_ca) if any(re.findall(r'(ntrans_|ch_|freq_)',v))]].mean().reset_index()
    
    #####The number of loans and loan balance by loan account type (unconditional)
    ctf_a = ctf.groupby(['id','dated','date','typea']).agg(ntrans_da=('tid','nunique'),ch_da=('bal_highprin','sum')).reset_index()
    ctf_a[[ 'ntrans_da', 'ch_da']] = ctf_a[[ 'ntrans_da', 'ch_da']].fillna(0)
    ctf_a = ctf_a.pivot(index=['id','dated','date'],columns=['typea'],values=['ntrans_da','ch_da']).reset_index()
    ctf_a.columns = [''.join(col).strip() for col in ctf_a.columns]
    ctf_a = pd.merge(ctf[['id','dated','date']].drop_duplicates(),ctf_a,how='left',on=['id','dated','date'])
    #Fill missing trans and high credit with zeros
    ctf_a[[v for v in list(ctf_a) if any(re.findall(r'(ntrans_|ch_)',v))]] = ctf_a[[v for v in list(ctf_a) if any(re.findall(r'(ntrans_|ch_)',v))]].fillna(0)
    #Take the mean within quarter
    ctf_am = ctf_a.groupby(['id','date'])[[v for v in list(ctf_a) if any(re.findall(r'(ntrans_|ch_|freq_)',v))]].mean().reset_index()
    
    #Pull all the variables together in a quarterly dataset.
    kvars = ['id','date','dated','ntrans','ntransa','ntrans_d','ch_d','freq_d']
    ctf_f = ctf[ctf.date==ctf.dated][kvars].drop_duplicates(subset=['id','date'])
    ctf_f = reduce(lambda df1,df2: pd.merge(df1,df2,how='outer',on=['id','date']), [ctf_f, ctf_m, ctf_max, ctf_cam, ctf_am])
    ctf_f.drop(columns=['dated'],inplace=True)
    # return ctf_f.to_dict(orient='records')
    ctf_f.to_parquet(cdd['p_d_cl_c'] + r'\ctf_parts\ctf_p'+str(g)+'_'+frequency+'.parquet')
    return  ['success']


tidl = list(ct.id.unique())
tidlg = split(tidl,50)
kvars = ['id', 'tid', 'typep', 'typea', 'open', 'close', 'bal_highprin', 'freq']
dates_daily = pd.date_range(start='6/1/2013', end='12/31/2020').tolist()

for f in ['m','q']:
    temp = Parallel(n_jobs=5)(delayed(panel_helper)(df = ct[ct.id.isin(tidlg[g])][kvars].copy(), datesd=dates_daily, datesmq=date_post_dict[f], target=tidlg[g], g=g, frequency=f) for g in tqdm(range(len(tidlg))))

for f in ['m','q']:
    files = [file for file in os.listdir(cdd['p_d_cl_c'] + r'\ctf_parts') if any(re.findall(r'_' + f,file,flags=re.IGNORECASE))]
    temp = Parallel(n_jobs=20)(delayed(pd.read_parquet)(cdd['p_d_cl_c'] + r'\ctf_parts\\'+f) for f in tqdm(files))
    ctf = pd.concat(temp, ignore_index=True,sort=False)
    ctf.to_parquet(cdd['p_d_cl_c'] + r'\ctf_'+f+'.parquet')






# =============================================================================
# Create the panel version of inquiries
# =============================================================================
ci = pd.read_parquet(cdd['p_d_cl_c'] + r'\Inquiries.parquet')
ci['datem'] = ci['date_ir'].map(date_post_dict['m'])
ci['dateq']= ci['date_ir'].map(date_post_dict['q'])


#Trim upper bounds that fall below the 75th pct of lower bounds and lower bounds above the 25th pct of upper bounds (takes about 8min)
def DoB(ci,a=0.9,b=0.1):
    ci.sort_values(by=['id','date_ir'],inplace=True)
    ci['DoB_above'] = ci['date_ir'] - dt.timedelta(days = 365, hours = 6) * (ci['age'] + 0)
    ci['DoB_below'] = ci['date_ir'] - dt.timedelta(days = 365, hours = 6) * (ci['age'] + 1)
    ci['DoB_above_g'] = pd.to_datetime(np.where(ci['DoB_above'] > ci.groupby('id')['DoB_below'].transform(lambda x: x.quantile(a)),ci['DoB_above'],pd.NaT))
    ci['DoB_below_g'] = pd.to_datetime(np.where(ci['DoB_below'] < ci.groupby('id')['DoB_above'].transform(lambda x: x.quantile(b)),ci['DoB_below'],pd.NaT))
    ci['DoB_above_g'] = ci.groupby('id')['DoB_above_g'].transform('min')
    ci['DoB_below_g'] = ci.groupby('id')['DoB_below_g'].transform('max')
    ci['dob'] = ci['DoB_below_g'] + (ci['DoB_above_g'] - ci['DoB_below_g'])/2
    ci.drop(columns=['DoB_above','DoB_below','DoB_above_g','DoB_below_g'],inplace=True)
    return ci[['id','dob']].drop_duplicates().to_dict(orient='records')

tidlg = split(list(ci.id.unique()),50)
kvars = ['id','iid','date_ir','age']
ci_dob = Parallel(n_jobs=10)(delayed(DoB)(ci = ci[ci.id.isin(tgt)][kvars].copy()) for tgt in tqdm(tidlg))
ci_dob = pd.DataFrame.from_dict([item for sublist in ci_dob for item in sublist], orient='columns')
ci_dob.to_parquet(cdd['p_d_cl_c'] + r'\ci_dob.parquet')
ci_dob = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_dob.parquet')

#Simplify housing and pay frequency
housedict = {'':np.nan,'OTHER':1,'PARENTS':2,'FAMILY':3,'FRIEND':4,'RENT':5,'RENTER':5,'OWN':6}
ci['housing'] = pd.to_numeric(ci['housing'].map(housedict))
pay_freq_dict = {'':np.nan,'BIWEEKLY':14,'BI WEEKLY':14,'SEMIMONTHLY':14,'B':14,'BI_WEEKLY':14,'BI-WEEKLY':14,'SEMI MONTHLY':14,'TWICE_MONTHLY':14,'BW':14,'TWICE MO':14,'TWICE A MONTH':14,'EVERY OTHER WEEK':14,'TWICEMONTHLY':14,'SM':14,'WEEKLY':7,'W':7,'MONTHLY':30,'m':30,'BI-MONTHLY':60}
ci['pay_freq'] = pd.to_numeric(ci['pay_freq'].map(pay_freq_dict))

ci = pd.merge(ci,ci_dob,how='left',on='id')
ci['dor'] = ci['date_ir'] - dt.timedelta(days=30) * (ci['reside'] + 0)
ci.to_parquet(cdd['p_d_cl_c'] + r'\ci.parquet')
ci.info(memory_usage='deep')


#Get some descriptive information for each date_ir
def helper(df,date):
    df_desc = []
    for v in ['housing','pay_freq','dor']:
        temp = df.groupby(['id',date],observed=True)[v].value_counts().rename(v+'_c').reset_index()
        temp = temp[temp[v+'_c']==temp.groupby(['id',date])[v+'_c'].transform('max')].drop_duplicates(subset=['id',date])
        temp.drop(columns=[v+'_c'],inplace=True)
        df_desc += [temp.rename(columns={'level_2':v})]
    df_desc += [df.groupby(['id',date])['income'].mean().reset_index()]
    df_desc += [df[['id',date,'dob']].drop_duplicates(subset=['id',date])]
    df_desc = reduce(lambda df1,df2: pd.merge(df1,df2,how='outer',on=['id',date]), df_desc)
    return df_desc

ci = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci.parquet')
tidlg = split(list(ci.id.unique()),50)
for f in ['m','q']:
# for f in ['q']:
    date = 'date'+f
    kvars = ['id',date,'zip','state','housing','pay_freq','dor','income','dob']
    ci_desc = pd.concat(Parallel(n_jobs=10)(delayed(helper)(df = ci[ci.id.isin(tgt)][kvars].copy(),date=date) for tgt in tqdm(tidlg)), ignore_index=True, sort=False)
    ci_desc.to_parquet(cdd['p_d_cl_c'] + r'\ci_desc_'+f+'.parquet')




ci = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci.parquet')
tidlg = split(list(ci.id.unique()),10)
#Calculate the distance between this inquiry and the next inquiry (both within product type and globally)
#   Also, calculate the number of inquiries submitted within the next X days unconditionally and condional on no activity in the last year
hor1 = [7,15] #[7,15,30,90,180,360]
hor2 = [3,9] #[3,6,9,12,15,18,21,24,27,30,45,60,90,180,270,360,540,730]
hor2_marg = {v:min([x for x in hor2 if x>v]) for v in hor2[:-1]}
ci_s = ci.groupby(['id','date_ir','datem','dateq','typea'],observed=True)['iid'].count().rename('inq_na').reset_index()
ci_s = ci_s.sort_values(by=['id','typea','date_ir'],ascending=[1,1,1])
ci_s['date_dela'] = (ci_s.groupby(['id','typea'])['date_ir'].shift(-1).fillna(dsend) - ci_s['date_ir']).dt.days
ci_s['date_dela_dsend'] = (dsend - ci_s['date_ir']).dt.days #If the date_dela is less than date_dela_dsend and the horizon of interest then we are good, else replace with missing.
def helper(df):
    df = df.sort_values(by=['id','typea','date_ir'],ascending=[1,1,1]).set_index(['date_ir'])
    for v in hor1:
        df['inq_na_past' + str(v)] = df.groupby(['id','typea']).rolling(dt.timedelta(days=v+1))['inq_na'].count().reset_index().set_index(['date_ir'])['inq_na'] - 1
    df = df.reset_index().sort_values(by=['id','typea','date_ir'],ascending=[1,1,0]).set_index(['date_ir'])
    for v in hor2:
        df['inq_na_' + str(v)] = df.groupby(['id','typea']).rolling(dt.timedelta(days=v+1))['inq_na'].count().reset_index().set_index(['date_ir'])['inq_na'] - 1
    df = df.reset_index().sort_values(by=['id','typea','date_ir'],ascending=[1,1,1])
    return df
ci_s = pd.concat(Parallel(n_jobs=10)(delayed(helper)(df = ci_s[ci_s.id.isin(tgt)].copy()) for tgt in tqdm(tidlg)),ignore_index=True,sort=False) 
for v in hor2_marg.keys():
    ci_s['inq_nam_'+str(v) + '_' + str(hor2_marg[v])] = 1*(ci_s['inq_na_' + str(hor2_marg[v])] - ci_s['inq_na_' + str(v)] > 0)
ci_s['inq_nam_1_' + str(hor2[0])] = 1*(ci_s['inq_na_' + str(hor2[0])] > 0)



#Make sure to drop observations n/nm/na/nam if the post period is after the end of the dataset
for v in hor2_marg.keys():
    ci_s['inq_nam_'+str(v) + '_' + str(hor2_marg[v])] =  np.where(ci_s['date_ir'] + dt.timedelta(days = v) > dsend,np.nan,ci_s['inq_nam_'+str(v) + '_' + str(hor2_marg[v])])
ci_s['typeas'] = ci_s.typea.map(actd).astype('category')
ci_s.to_parquet(cdd['p_d_cl_c'] + r'\ci_s.parquet')
ci_s.info(memory_usage='deep')
ci_s = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_s.parquet')



ci_ss = ci.groupby(['id','date_ir','datem','dateq'],observed=True)['iid'].count().rename('inq_n').reset_index()
ci_ss.sort_values(by=['id','date_ir'],inplace=True)
ci_ss['date_del'] = (ci_ss.groupby(['id'])['date_ir'].shift(-1).fillna(dsend) - ci_ss['date_ir']).dt.days
ci_ss['date_del_dsend'] = (dsend - ci_ss['date_ir']).dt.days
def helper(df):
    df = df.sort_values(by=['id','date_ir'],ascending=[1,1]).set_index(['date_ir'])
    for v in hor1:
        df['inq_n_past' + str(v)] = df.groupby(['id']).rolling(dt.timedelta(days=v+1))['inq_n'].count().reset_index().set_index(['date_ir'])['inq_n'] - 1
    df = df.reset_index().sort_values(by=['id','date_ir'],ascending=[1,0]).set_index(['date_ir'])
    for v in hor2:
        df['inq_n_' + str(v)] = df.groupby(['id']).rolling(dt.timedelta(days=v+1))['inq_n'].count().reset_index().set_index(['date_ir'])['inq_n'] - 1
    df = df.reset_index().sort_values(by=['id','date_ir'],ascending=[1,1])
    return df
ci_ss = pd.concat(Parallel(n_jobs=10)(delayed(helper)(df = ci_ss[ci_ss.id.isin(tgt)].copy()) for tgt in tqdm(tidlg)),ignore_index=True,sort=False) 
for v in hor2_marg.keys():
    ci_ss['inq_nm_'+str(v) + '_' + str(hor2_marg[v])] = 1*(ci_ss['inq_n_' + str(hor2_marg[v])] - ci_ss['inq_n_' + str(v)] > 0)
ci_ss['inq_nm_1_' + str(hor2[0])] = 1*(ci_ss['inq_n_' + str(hor2[1])] > 0)
#Make sure to drop observations n/nm/na/nam if the post period is after the end of the dataset
for v in hor2_marg.keys():
    ci_ss['inq_nm_'+str(v) + '_' + str(hor2_marg[v])] =  np.where(ci_ss['date_ir'] + dt.timedelta(days = v) > dsend,np.nan,ci_ss['inq_nm_'+str(v) + '_' + str(hor2_marg[v])])
ci_ss.to_parquet(cdd['p_d_cl_c'] + r'\ci_ss.parquet')
ci_ss.info(memory_usage='deep')


for f in ['m','q']:
    date = 'date' + f
    #Aggregate the inquiries by individual by period
    ci_agg = ci.groupby(['id',date,'typea'],observed=True)['iid'].count().rename('inq').reset_index()
    ci_agg = ci_agg.pivot(index=['id',date],columns='typea',values='inq')
    ci_agg.rename(columns={v:'inq_'+v for v in list(ci_agg)},inplace=True)
    #Lump inquiries within a 1, 3, and 7 day period and typea as a single inq event and look at the number of distinct inquiry "events".
    ci_sagg = ci.groupby(['id','date_ir',date,'typea'],observed=True)['iid'].count().rename('inq_nac').reset_index()
    def helper(df):
        df = df.sort_values(by=['id','typea','date_ir'],ascending=[1,1,1]).set_index(['date_ir'])
        for v in hor2:
            df['inq_na_past' + str(v)] = df.groupby(['id','typea']).rolling(dt.timedelta(days=v+1))['inq_nac'].count().reset_index().set_index(['date_ir'])['inq_nac'] - 1
        df = df.reset_index().sort_values(by=['id','typea','date_ir'],ascending=[1,1,1])
        return df
    ci_sagg = pd.concat(Parallel(n_jobs=10)(delayed(helper)(df = ci_sagg[ci_sagg.id.isin(tgt)].copy()) for tgt in tqdm(tidlg)),ignore_index=True,sort=False) 
    for v in hor2:
        ci_sagg['inq_nac' + str(v)] = 1*(ci_sagg['inq_na_past' + str(v)] == 0)
    ci_sagg['inq_nac'] =1*(ci_sagg['inq_nac'] > 0)
    ci_sagg = ci_sagg.groupby(['id',date,'typea'],observed=True)[['inq_nac']+['inq_nac'+str(v) for v in hor2]].sum().reset_index()
    ci_saggp = ci_sagg.pivot(index=['id',date],columns='typea',values=['inq_nac']+['inq_nac'+str(v) for v in hor2])
    ci_saggp.columns = ['_'.join(col).strip() for col in ci_saggp.columns.values]
    ci_saggp.reset_index(inplace=True)
    #Merge the two datasets of extensive margin inquiries and export
    ci_agg = pd.merge(ci_agg,ci_saggp, how='left',on=['id',date])
    ci_agg.to_parquet(cdd['p_d_cl_c'] + r'\ci_agg_'+f+'.parquet')
    ci_agg.head(10).to_parquet(cdd['p_d_cl_c'] + r'\ci_agg_'+f+'_s.parquet')
    ci_agg.info(memory_usage='deep')



ci = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci.parquet')
idlist = list(ci.id.unique())
nparts = {'q':10,'m':20}
for f in ['m','q']:
    date = 'date' + f
    ci_agg = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_agg_'+f+'.parquet')
    ci_desc = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci_desc_'+f+'.parquet')
    
    tidlg = split(idlist,nparts[f])
    for i in tqdm(range(len(tidlg))):
        #Merge the descriptive and inquiry information and fill in missing dates 
        cif = bal_panel({'id':tidlg[i],date:[v for v in ci[date].unique() if (v>dt.datetime(2013,12,1))&(v<dt.datetime(2021,1,1))]})
        # cif = pd.merge(cif,ci_agg, how='left',on=['id',date])
        cif = pd.merge(cif,ci_agg[['id',date,'inq_C9','inq_C3','inq_C6','inq_C5','inq_C1','inq_C13','inq_C2']], how='left',on=['id',date])
        cif = pd.merge(cif,ci_desc, how='left',on=['id',date])
        cif.info(memory_usage='deep')
        
        #The fill operation takes about 3 minutes
        cif = fill_eff(df = cif, vfill=ldiff(list(ci_desc),['id', date, 'reside']), gp=['id'], sort={'svars':['id',date],'sort1':[1, 1],'sort2':[1, 0]},par=True,par_ng=100)
        cif['age'] = np.floor((cif[date] - cif['dob']).dt.days/365)
        cif['reside'] = np.floor((cif[date] - cif['dor']).dt.days/30)
        cif['reside'] = np.where(cif['reside']>=0, cif['reside'], np.nan)
        
        #Fill the inquiries with zeros
        cif[ldiff(lint(list(cif),list(ci_agg)),['id', date])] = cif[ldiff(lint(list(cif),list(ci_agg)),['id', date])].fillna(0)
        cif.to_parquet(cdd['p_d_cl_c'] + r'\cif_parts\cif_p'+str(i)+'_'+f+'.parquet')


for f in ['m','q']:
    files = [file for file in os.listdir(cdd['p_d_cl_c'] + r'\cif_parts') if any(re.findall(r'_' + f,file,flags=re.IGNORECASE))]
    temp = Parallel(n_jobs=5)(delayed(pd.read_parquet)(cdd['p_d_cl_c'] + r'\cif_parts\\'+f) for f in tqdm(files))
    cif = pd.concat(temp, ignore_index=True, sort=False)
    cif.to_parquet(cdd['p_d_cl_c'] + r'\cif_'+f+'.parquet')


# =============================================================================
# Create location info based on both the transactions and inquiries files.
# =============================================================================
ct = pd.read_parquet(cdd['p_d_cl_c'] + r'\ct.parquet')
ci = pd.read_parquet(cdd['p_d_cl_c'] + r'\ci.parquet')
ct['sourceit'] = 1
ci['sourceit'] = 0


for f in ['m','q']:
    date = 'date' + f
    cloc = pd.concat([ci[['id','date_ir','sourceit','zip','state']].rename(columns={'date_ir':'date'}),ct[['id','open','zip','state']].rename(columns={'open':'date'})],ignore_index=True,sort=False)
    cloc[date] = cloc['date'].map(date_post_dict[f])
    cloc.reset_index(drop=True,inplace=True)
    cloc.sort_values(by=['id','date'],inplace=True)
    
    cloc_desc = []
    for v in ['state','zip']:
        temp = cloc.groupby(['id',date])[v].value_counts().rename(v+'_c').reset_index()
        temp = temp[temp[v+'_c']==temp.groupby(['id',date])[v+'_c'].transform('max')].drop_duplicates(subset=['id',date])
        temp.drop(columns=[v+'_c'],inplace=True)
        cloc_desc += [temp]
    cloc_desc = reduce(lambda df1,df2: pd.merge(df1,df2,how='outer',on=['id',date]), cloc_desc)
    cloc = bal_panel({'id':cloc_desc.id.unique(),date:cloc[date].unique()})
    cloc = pd.merge(cloc, cloc_desc, how='left',on=['id',date])
    cloc = fill_eff(df = cloc, vfill=['zip','state'], gp=['id'], sort={'svars':['id',date],'sort1':[1, 1],'sort2':[1, 0]},par=True,par_ng=10)
    cloc['state'] = cloc['state'].astype('category')
    cloc.info(memory_usage='deep')
    cloc.to_parquet(cdd['p_d_cl_c'] + r'\cloc_'+f+'.parquet')

















